Exploring associations of miRNA expression levels across tumor types in TCGA with suggested correlates defined by the Immune Response Working Group:
Data are stored on Synapse in the folder syn6171109.
Sample characteristics are stored in a tab-delimited text file (Synapse ID: syn7222010) and can be loaded with read_tsv().
# load sample data
mirna_sample_file <- mirna_files %>%
filter(file.id == "syn7222010") %>%
.[["file_path"]]
mirna_sample_df <- read_tsv(mirna_sample_file)
Parsed with column specification:
cols(
id = col_character(),
Disease = col_character(),
Sample_Type = col_integer(),
Protocol = col_character(),
Platform = col_character()
)
Sample Quality Annotations: syn4551248 (“merged_sample_quality_annotations.tsv”)
sample_qual_file <- synGet("syn4551248", downloadLocation = "../data/tcga/")
sample_qual_df <- sample_qual_file %>%
getFileLocation() %>%
read_tsv()
Parsed with column specification:
cols(
patient_barcode = col_character(),
aliquot_barcode = col_character(),
`cancer type` = col_character(),
platform = col_character(),
patient_annotation = col_character(),
sample_annotation = col_character(),
aliquot_annotation = col_character(),
aliquot_annotation_updated = col_character(),
AWG_excluded_because_of_pathology = col_double(),
AWG_pathology_exclusion_reason = col_character(),
Reviewed_by_EPC = col_double(),
Do_not_use = col_character()
)
remove samples based on Do_not_use=True, and remove cases with AWG_excluded_because_of_pathology=True
# samples to exclude from all datasets
exclude_samples <- sample_qual_df %>%
mutate(AWG_excluded_because_of_pathology = parse_logical(AWG_excluded_because_of_pathology),
Do_not_use = parse_logical(str_to_upper(Do_not_use))) %>%
filter(AWG_excluded_because_of_pathology | Do_not_use)
Remove samples from miRNA dataset.
mirna_sample_df <- mirna_sample_df %>%
filter(!(id %in% exclude_samples$aliquot_barcode))
miRNA normalized, batch corrected expression values for all samples are stored as a matrix in a CSV file (Synapse ID: syn7201053) and can be loaded with read_csv().
mirna_corr_file <- "../data/intermediate/mirna_correlate_data.feather"
force_read <- FALSE
if (!file.exists(mirna_corr_file) | force_read) {
# load normalized, batch-corrected expression data
mirna_norm_file <- mirna_files %>%
filter(file.id == "syn7201053") %>%
.[["file_path"]]
mirna_norm_df <- read_csv(mirna_norm_file, progress = FALSE)
mirna_corr_df <- mirna_norm_df %>%
select(one_of(c("Genes", mirna_sample_pt_df$id)))
write_feather(mirna_corr_df, mirna_corr_file)
rm(mirna_norm_df)
} else {
mirna_corr_df <- read_feather(mirna_corr_file)
}
tcga_colors <- tribble(
~Color, ~Disease,
"#ED2891", "BRCA",
"#B2509E", "GBM",
"#D49DC7", "LGG",
"#C1A72F", "ACC",
"#E8C51D", "PCPG",
"#F9ED32", "THCA",
"#104A7F", "CHOL",
"#9EDDF9", "COAD",
"#007EB5", "ESCA",
"#CACCDB", "LIHC",
"#6E7BA2", "PAAD",
"#DAF1FC", "READ",
"#00AEEF", "STAD",
"#F6B667", "CESC",
"#D97D25", "OV",
"#FBE3C7", "UCEC",
"#F89420", "UCS",
"#97D1A9", "HNSC",
"#009444", "UVM",
"#754C29", "LAML",
"#CEAC8F", "THYM",
"#3953A4", "DLBC",
"#BBD642", "SKCM",
"#00A99D", "SARC",
"#D3C3E0", "LUAD",
"#A084BD", "LUSC",
"#542C88", "MESO",
"#FAD2D9", "BLCA",
"#ED1C24", "KICH",
"#F8AFB3", "KIRC",
"#EA7075", "KIRP",
"#7E1918", "PRAD",
"#BE1E2D", "TGCT"
)
mirna_sample_df <- mirna_sample_df %>%
mutate(Disease = factor(Disease, levels = tcga_colors$Disease))
The table mirna_sample_df contains 5 columns describing the 10543 samples in the data. The Sample_Type column corresponds to TCGA sample type codes, which are defined here. The following sample types are included in the miRNA data:
mirna_sample_df %>%
group_by(Sample_Type) %>%
tally()
Based on the codes, this is the distribution of sample types:
mirna_sample_df %>%
group_by(Sample_Type) %>%
tally() %>%
mutate(Definition = case_when(
.$Sample_Type == 1 ~ "Primary Solid Tumor",
.$Sample_Type == 2 ~ "Recurrent Solid Tumorr",
.$Sample_Type == 3 ~ "Primary Blood Derived Cancer - Peripheral Blood",
.$Sample_Type == 5 ~ "Additional - New Primary",
.$Sample_Type == 6 ~ "Metastatic",
.$Sample_Type == 7 ~ "Additional Metastatic",
.$Sample_Type == 11 ~ "Solid Tissue Normal"
))
Note: only include “primary” tumor samples for now; also, add additional column to store vial ID (to ease mapping between data sets).
mirna_sample_pt_df <- mirna_sample_df %>%
filter(Sample_Type %in% c(1, 3, 5)) %>%
mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", ""))
Additionally, the following disease types are included:
mirna_sample_pt_df %>%
group_by(Disease) %>%
tally()
Note: exclude LAML, THYM, and DLBC from cell content correlations
And technical variables:
mirna_sample_pt_df %>%
group_by(Protocol, Platform) %>%
tally()
Expression values in the data are reportedly reads per million (RPM). I’ve randomly selected a few samples from each disease type, sample type, protocol, and platform to inspect the distribution of expression values across all 743 miRNA genes.
set.seed(0)
# randomly select 1-3 samples from each combination of characteristics
sample_sub_df <- mirna_sample_pt_df %>%
group_by(Disease, Sample_Type, Protocol, Platform) %>%
sample_n(3, replace = TRUE) %>%
ungroup() %>%
distinct()
# subset and melt the expression data
mirna_sub_df <- mirna_corr_df %>%
select(one_of(c("Genes", sample_sub_df$id))) %>%
gather("sample", "expression", -Genes)
Even with a shifted log (log10(x + 1)) transformation, expression values look to be more exponentially distributed than normal.
mirna_sub_df %>%
left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>%
ggplot(aes(x = log10(expression + 1))) +
stat_density(aes(group = sample), geom = "line", position = "identity",
alpha = 0.2)
I can also look at the distribution of log-RPM values with boxplots:
mirna_sub_df %>%
left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>%
ggplot(aes(x = sample, y = log10(expression + 1))) +
geom_boxplot(aes(fill = Disease), outlier.size = 0.5) +
theme(axis.text.x = element_blank()) +
scale_fill_manual(values = tcga_colors$Color)
# convert expression df to matrix
mirna_corr_mat <- mirna_corr_df %>%
select(one_of(c("Genes", mirna_sample_pt_df$id))) %>%
column_to_rownames("Genes") %>%
as.matrix()
I used the prcomp() function on the transposed expression matrix (samples x genes, after transposing) to compute PCA data, which I can use to look for batch effects among samples.
mirna_corr_pca <- mirna_corr_mat %>%
t() %>%
prcomp()
(I can use the broom:tidy() function to convert data in the prcomp object into data frames for ggplot.)
pc_df <- mirna_corr_pca %>%
tidy("pcs")
mirna_corr_pca_df <- mirna_corr_pca %>%
tidy("samples") %>%
filter(PC <= 2) %>%
left_join(mirna_sample_pt_df, by = c("row" = "id"))
rm(mirna_corr_mat)
The plot below shows samples plotted as points along the first two principle components (PCs). Points are colored by disease type.
pc1_label <- pc_df %>%
filter(PC == 1) %>%
transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>%
flatten_chr()
pc2_label <- pc_df %>%
filter(PC == 2) %>%
transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>%
flatten_chr()
mirna_corr_pca_df %>%
spread(PC, value) %>%
ggplot(aes(x = `1`, y = `2`)) +
geom_point(aes(colour = Disease)) +
xlab(pc1_label) +
ylab(pc2_label) +
scale_color_manual(values = tcga_colors$Color)
Cellular Content: syn7994728 syn5808205 (“TCGA_all_leuk_estimate.masked.20170107.tsv”)
# file_data <- synGet("syn5808205", downloadLocation = "./")
leuk_frac_file <- "../data/tcga/TCGA_all_leuk_estimate.masked.20170107.tsv"
leuk_frac_df <- read_tsv(leuk_frac_file, col_names = FALSE) %>%
set_names(c("disease", "id", "leuk_frac"))
Parsed with column specification:
cols(
X1 = col_character(),
X2 = col_character(),
X3 = col_double()
)
leuk_frac_df <- leuk_frac_df %>%
filter(!(id %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and leukocyte fraction.
leuk_frac_ids <- leuk_frac_df %>%
select(id) %>%
mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>%
arrange()
mirna_ids <- mirna_sample_pt_df %>%
select(id, vial_id) %>%
arrange()
mirna_leuk_frac_shared_ids <- inner_join(mirna_ids, leuk_frac_ids,
by = "vial_id",
suffix = c("_mirna", "_leuk_frac"))
# only keep samples with matched vial ID AND portion number
portion_id_minus_analyte_regex <- "([:alnum:]+\\-){4}[0-9]+"
# plate_id_only_regex <- "[:alnum:]+(?=([:alnum:]\\-[:alnum:]+){1}$)"
mirna_leuk_frac_shared_ids <- mirna_leuk_frac_shared_ids %>%
filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
== str_extract(id_leuk_frac, portion_id_minus_analyte_regex)))
NOTE: several samples were assayed on multiple plates; average the leukocyte fraction across these before computing correlations with miRNA
leuk_frac_corr_file <- "../data/intermediate/leuk_frac_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(leuk_frac_corr_file) | force_format) {
leuk_frac_corr_df <- mirna_leuk_frac_shared_ids %>%
left_join(leuk_frac_df, by = c("id_leuk_frac" = "id")) %>%
group_by(disease, vial_id) %>%
summarise(leuk_frac = mean(leuk_frac)) %>%
ungroup()
write_feather(leuk_frac_corr_df, leuk_frac_corr_file)
} else {
leuk_frac_corr_df <- read_feather(leuk_frac_corr_file)
}
mirna_leuk_frac_corr_file <- "../results/mirna_leuk_frac_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_leuk_frac_corr_file) | force_compute) {
# skip immune cell cancers?
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% leuk_frac_corr_df$disease) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(leuk_frac_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- leuk_frac_corr_df %>%
filter(vial_id %in% samples_d) %>%
dplyr::rename(sample = vial_id, x = leuk_frac) %>%
mutate(correlate = "leuk_frac")
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_leuk_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "leukocyte fraction")
write_feather(mirna_leuk_frac_corr_df, mirna_leuk_frac_corr_file)
} else {
mirna_leuk_frac_corr_df <- read_feather(mirna_leuk_frac_corr_file)
}
Cellular Content: syn4991611 syn8024565 (“TCGA.cluster-by-CIBERSORT-relative.tsv”)
or…?
syn7337221 (“TCGA.Kallisto.fullIDs.cibersort.relative.tsv”)
# ciber_frac_file <- "../data/tcga/TCGA.cluster-by-CIBERSORT-relative.tsv"
ciber_frac_file <- "../data/tcga/TCGA.Kallisto.fullIDs.cibersort.relative.tsv"
ciber_frac_df <- read_tsv(ciber_frac_file)
Parsed with column specification:
cols(
.default = col_double(),
SampleID = col_character(),
CancerType = col_character()
)
See spec(...) for full column specifications.
ciber_frac_df <- ciber_frac_df %>%
mutate(id = str_replace_all(SampleID, "\\.", "\\-")) %>%
filter(!(id %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and CIBERSORT fraction.
ciber_frac_ids <- ciber_frac_df %>%
select(id) %>%
mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>%
arrange()
# only keep samples with matched vial ID AND portion number
mirna_ciber_frac_shared_ids <- inner_join(mirna_ids, ciber_frac_ids,
by = "vial_id",
suffix = c("_mirna", "_ciber_frac")) %>%
distinct() %>%
filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
== str_extract(id_ciber_frac, portion_id_minus_analyte_regex)))
NOTE: several samples were assayed on multiple plates; average the CIBERSORT fraction across these before computing correlations with miRNA
ciber_frac_corr_file <- "../data/intermediate/ciber_frac_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(ciber_frac_corr_file) | force_format) {
ciber_frac_corr_df <- mirna_ciber_frac_shared_ids %>%
left_join(ciber_frac_df, by = c("id_ciber_frac" = "id")) %>%
select(-id_mirna, -id_ciber_frac,
-SampleID, -P.value, -Correlation, -RMSE) %>%
group_by(CancerType, vial_id) %>%
summarise_each(funs(mean)) %>%
ungroup()
write_feather(ciber_frac_corr_df, ciber_frac_corr_file)
} else {
ciber_frac_corr_df <- read_feather(ciber_frac_corr_file)
}
mirna_ciber_frac_corr_file <- "../results/mirna_ciber_frac_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_ciber_frac_corr_file) | force_compute) {
# skip immune cell cancers?
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% ciber_frac_corr_df$CancerType) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(ciber_frac_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- ciber_frac_corr_df %>%
filter(vial_id %in% samples_d) %>%
select(-CancerType) %>%
gather(correlate, x, -vial_id) %>%
dplyr::rename(sample = vial_id)
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_ciber_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "CIBERSORT fraction")
write_feather(mirna_ciber_frac_corr_df, mirna_ciber_frac_corr_file)
} else {
mirna_ciber_frac_corr_df <- read_feather(mirna_ciber_frac_corr_file)
}
Mutation Load: syn5706827 syn7994728 (“mutation-load”)
mut_load_file <- "../data/tcga/mutation-load-vial.txt"
mut_load_df <- read_tsv(mut_load_file)
Parsed with column specification:
cols(
cohort = col_character(),
Patient_ID = col_character(),
Tumor_Sample_ID = col_character(),
Tumor_Sample_ID_Vial = col_character(),
`Silent per Mb` = col_double(),
`Non-silent per Mb` = col_double()
)
Can’t filter aliquots, as data only includes IDs specific to the level of vial
Identify matched samples between miRNA and mutational load.
mut_load_ids <- mut_load_df %>%
select(id = Tumor_Sample_ID_Vial) %>%
mutate(vial_id = id) %>%
arrange()
mirna_mut_load_shared_ids <- inner_join(mirna_ids, mut_load_ids,
by = "vial_id",
suffix = c("_mirna", "_mut_load"))
mut_load_corr_file <- "../data/intermediate/mut_load_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(mut_load_corr_file) | force_format) {
mut_load_corr_df <- mirna_mut_load_shared_ids %>%
left_join(mut_load_df, by = c("id_mut_load" = "Tumor_Sample_ID_Vial"))
write_feather(mut_load_corr_df, mut_load_corr_file)
} else {
mut_load_corr_df <- read_feather(mut_load_corr_file)
}
mirna_mut_load_corr_file <- "../results/mirna_mut_load_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_mut_load_corr_file) | force_compute) {
# skip immune cell cancers?
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% mut_load_corr_df$cohort) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(mut_load_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- mut_load_corr_df %>%
filter(vial_id %in% samples_d) %>%
select(-id_mirna, -id_mut_load, -cohort, -Patient_ID,
-Tumor_Sample_ID) %>%
gather(correlate, x, -vial_id) %>%
dplyr::rename(sample = vial_id)
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_mut_load_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "mutation load")
write_feather(mirna_mut_load_corr_df, mirna_mut_load_corr_file)
} else {
mirna_mut_load_corr_df <- read_feather(mirna_mut_load_corr_file)
}
T Cell Receptor / Brown et al: syn5876488 syn7063422 (“mitcr_sampleStatistics_20160714.tsv”)
tcr_div_file <- "../data/tcga/mitcr_sampleStatistics_20160714.tsv"
tcr_div_df <- read_tsv(tcr_div_file)
Parsed with column specification:
cols(
AliquotBarcode = col_character(),
Study = col_character(),
SampleTypeLetterCode = col_character(),
ParticipantBarcode = col_character(),
SampleBarcode = col_character(),
CGHub_analysis_id = col_character(),
totTCR_reads = col_integer(),
totTCRa_reads = col_integer(),
totTCRb_reads = col_integer(),
shannon = col_double(),
numClones = col_integer()
)
tcr_dv_df <- tcr_div_df %>%
filter(!(AliquotBarcode %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and CIBERSORT fraction.
tcr_div_ids <- tcr_div_df %>%
select(id = AliquotBarcode, vial_id = SampleBarcode) %>%
arrange()
# only keep samples with matched vial ID AND portion number
mirna_tcr_div_shared_ids <- inner_join(mirna_ids, tcr_div_ids,
by = "vial_id",
suffix = c("_mirna", "_tcr_div")) %>%
distinct() %>%
filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
== str_extract(id_tcr_div, portion_id_minus_analyte_regex)))
NOTE: several samples were assayed on multiple plates; average the TCR diversity across these before computing correlations with miRNA
Some samples (AliquotBarcode) have 2 values for Shannon entropy, apparently corresponding to separate analyses on CGHub. I’ll go ahead and average these values per aliquot, prior to averaging Shannon values per vial ID. As a small measure of QC, I’ll also only keep observations with at least 1 TCRA read or 1 TCRB read.
tcr_div_corr_file <- "../data/intermediate/tcr_div_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(tcr_div_corr_file) | force_format) {
tcr_div_corr_df <- mirna_tcr_div_shared_ids %>%
left_join(tcr_div_df, by = c("id_tcr_div" = "AliquotBarcode")) %>%
select(-CGHub_analysis_id) %>%
distinct() %>%
filter(totTCRa_reads > 0 | totTCRb_reads > 0) %>%
group_by(Study, vial_id, id_tcr_div) %>%
summarize(shannon = mean(shannon)) %>%
group_by(Study, vial_id) %>%
summarise(shannon = mean(shannon)) %>%
ungroup()
write_feather(tcr_div_corr_df, tcr_div_corr_file)
} else {
tcr_div_corr_df <- read_feather(tcr_div_corr_file)
}
mirna_tcr_div_corr_file <- "../results/mirna_tcr_div_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_tcr_div_corr_file) | force_compute) {
# skip immune cell cancers
d_list <- mirna_sample_pt_df %>%
# filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
filter(Disease %in% tcr_div_corr_df$Study) %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(tcr_div_corr_df$vial_id)
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- tcr_div_corr_df %>%
filter(vial_id %in% samples_d) %>%
select(-Study) %>%
dplyr::rename(sample = vial_id) %>%
gather(correlate, x, -sample)
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_tcr_div_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "TCR diversity")
write_feather(mirna_tcr_div_corr_df, mirna_tcr_div_corr_file)
} else {
mirna_tcr_div_corr_df <- read_feather(mirna_tcr_div_corr_file)
}
Batch effects normalized mRNA data: syn4976363 syn4976369 (“EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv”) syn4976366 (" EB++GeneExpAnnotation.tsv“)
Sample characteristics are stored in a tab-delimited text file (Synapse ID: syn4976366) and can be loaded with read_tsv().
# load sample data
mrna_sample_file <- mrna_files %>%
filter(file.id == "syn4976366") %>%
.[["file_path"]]
mrna_sample_df <- read_tsv(mrna_sample_file)
Parsed with column specification:
cols(
SampleID = col_character(),
Center = col_character(),
platform = col_character(),
Adjustment = col_character()
)
Remove samples from mRNA dataset.
mrna_sample_df <- mrna_sample_df %>%
filter(!(SampleID %in% exclude_samples$aliquot_barcode))
Identify matched samples between miRNA and mRNA.
mrna_ids <- mrna_sample_df %>%
select(SampleID) %>%
mutate(vial_id = str_replace(SampleID, "(\\-[:alnum:]+){3}$", "")) %>%
arrange()
mirna_mrna_shared_ids <- inner_join(mirna_ids, mrna_ids, by = "vial_id")
# only keep samples with matched vial ID AND portion number
mirna_mrna_shared_ids <- mirna_mrna_shared_ids %>%
filter(str_extract(id, portion_id_minus_analyte_regex)
== str_extract(SampleID, portion_id_minus_analyte_regex))
mRNA normalized, batch corrected expression values for all samples are stored as a matrix in a TSV file (Synapse ID: syn4976369) and can be loaded with read_tsv().
List of genes accessed here and saved as a TSV at data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv:
gene_correlate_file <- "../data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv"
gene_correlate_df <- read_tsv(gene_correlate_file)
Parsed with column specification:
cols(
Gene = col_character(),
`HGNC Symbol` = col_character(),
`Entrez ID` = col_integer(),
`Gene Family` = col_character(),
`Immune Checkpoint` = col_character(),
matched_name = col_character(),
group = col_character()
)
3 parsing failures.
row col expected actual
20 -- 7 columns 6 columns
61 -- 7 columns 6 columns
64 -- 7 columns 6 columns
mrna_corr_file <- "../data/intermediate/mrna_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(mrna_corr_file) | force_format) {
# load normalized, batch-corrected expression data
mrna_norm_file <- mrna_files %>%
filter(file.id == "syn4976369") %>%
.[["file_path"]]
mrna_norm_df <- read_tsv(mrna_norm_file, progress = FALSE)
# gene_list <- c("IFNG", "PRF1", "GZMA", "PDCD1", "CD274", "PDCD1LG2", "IL10",
# "TGFB1", "IDO1", "HLA-A")
mrna_corr_df <- mrna_norm_df %>%
separate(gene_id, c("gene_name", "gene_id"), sep = "\\|") %>%
filter((gene_id %in% gene_correlate_df$`Entrez ID`)
| (gene_name %in% gene_correlate_df$`HGNC Symbol`)) %>%
select(one_of(c("gene_name", "gene_id",
mirna_mrna_shared_ids$SampleID)))
write_feather(mrna_corr_df, mrna_corr_file)
rm(mrna_norm_df)
} else {
mrna_corr_df <- read_feather(mrna_corr_file)
}
mirna_mrna_corr_file <- "../results/mirna_mrna_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_mrna_corr_file) | force_compute) {
d_list <- mirna_sample_pt_df %>%
distinct(Disease) %>%
mutate(Disease = as.character(Disease)) %>%
.$Disease %>%
set_names(.) %>%
as.list()
corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
samples_d <- mirna_sample_pt_df %>%
filter(Disease == d) %>%
select(vial_id) %>%
flatten_chr() %>%
intersect(str_replace(names(mrna_corr_df),
"(\\-[:alnum:]+){3}$", ""))
source_df <- mirna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("Genes", samples_d))) %>%
dplyr::rename(mirna = Genes) %>%
gather(sample, x, -mirna) %>%
filter(!is.na(x))
target_df <- mrna_corr_df %>%
set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
select(one_of(c("gene_name", samples_d))) %>%
dplyr::rename(correlate = gene_name) %>%
gather(sample, x, -correlate) %>%
filter(!is.na(x))
corr_df <- inner_join(source_df, target_df,
by = "sample",
suffix = c("_source", "_target")) %>%
group_by(mirna, correlate) %>%
do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>%
ungroup()
corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
return(corr_df)
})
mirna_mrna_corr_df <- bind_rows(corr_df_list, .id = "disease") %>%
mutate(correlate_type = "mRNA expression")
write_feather(mirna_mrna_corr_df, mirna_mrna_corr_file)
} else {
mirna_mrna_corr_df <- read_feather(mirna_mrna_corr_file)
}
get_cet_pal <- function(cet_path) {
read_lines(cet_path) %>%
as.list() %>%
map_chr(function(c) {
c_rgb <- str_split_fixed(string = c, pattern = ",", n = 3)
rgb(c_rgb[1], c_rgb[2], c_rgb[3], maxColorValue = 255)
})
}
# here's an example with the diverging blue-to yellow colormap:
cet_path <- "~/Downloads/CETperceptual_csv_0_255/diverging-linear_bjy_30-90_c45_n256.csv"
my_cet_pal <- get_cet_pal(cet_path)
mirna_causal_df <- readxl::read_excel("../data/causal_TCGA_panCancer_miRNA_immuneInfiltrate_3_15_2017.xlsx") %>%
.[, 1:9] %>%
mutate(mirna_disease = str_c(`miRNA Name`, `Tumor Type`, sep = "_"))
mirna_immune_df <- readxl::read_excel("../data/Paladini_immune_miRNAs.xlsx")
mirna_immune_tidy_df <- mirna_immune_df %>%
mutate(mirna_group = str_split(MicroRNAs, ",")) %>%
unnest(mirna_group) %>%
mutate(mirna_group = str_trim(mirna_group, "both")) %>%
# distinct(mirna_group) %>%
mutate(mirna = mirna_group) %>%
# filter(Category %in% c("Innate immunity", "Adaptive immunity")) %>%
# filter(str_detect(mirna, "cluster")) %>%
filter(!str_detect(mirna_group, "[0-9]+[a-z]{2,}")) %>%
mutate(mirna = str_replace(mirna, "miR-17/92 cluster", "miR-17,miR-18a,miR-19a,miR-20a,miR-19b-1,miR-92a-1"),
mirna = str_replace(mirna, "miR-212/132 cluster", "miR-212,miR-132"),
mirna = str_replace(mirna, "miR-10 family", "miR-10a,miR-10b,miR-99a,miR-99b,miR-100,miR-125a,miR-125b-1,miR-125b-2"),
mirna = str_replace(mirna, "miR-221/222", "miR-221,miR-222"),
mirna = str_replace(mirna, "miR-10a/b", "miR-10a,miR-10b"),
mirna = str_replace(mirna, "miR-148/152", "miR-148,miR-152"),
mirna = str_replace(mirna, "miR-17-5p/20a", "miR-17-5p,miR-20a"),
mirna = str_replace(mirna, "miR-221/222 cluster", "miR-221,miR-222"),
mirna = str_replace(mirna, "miR-181a/b", "miR-181a,miR-181b"),
mirna = str_replace(mirna, "miR-15/16", "miR-15,miR-16"),
mirna = str_replace(mirna, "miR-181 family", "miR-181a-1,miR-181a-2,miR-181b-1,miR-181b-2,miR-181c,miR-181d"),
mirna = str_replace(mirna, "miR-130/301", "miR-130,miR-301"),
mirna = str_replace(mirna, "miR-99a/miR-150", "miR-99a,miR-150"),
mirna = str_replace(mirna, "Let", "let")) %>%
mutate(mirna = str_split(mirna, ",")) %>%
unnest(mirna) %>%
# left_join(mirna_sig_corr_df %>%
# filter(correlate_type == "CIBERSORT fraction") %>%
# mutate(mirna_short = str_extract(
# mirna, "(?<=hsa\\-)[:alnum:]+\\-[:alnum:]+")
# ),
# by = c("mirna" = "mirna_short")) %>%
# filter(!is.na(disease)) %>%
# group_by(Cell_lineage, Cellular_process, mirna, group, label) %>%
# tally() %>%
# ungroup() %>%
# filter(group == "Adaptive Immune Cells",
# str_detect(Cell_lineage, "^T"),
# str_detect(label, "^T")) %>%
# distinct(Cell_lineage, label) %>%
# mutate(Cell_lineage = str_replace(Cell_lineage, "T ", "T cells "),
# Cell_lineage = str_replace(Cell_lineage, " cells$", ""),
# label = str_replace_all(label, "\\.", " "),
# label = str_replace(label, "CD4", "helper"),
# label = str_replace(label, "CD8", "cytotoxic")) %>%
# filter(!(label %in% Cell_lineage) & !(Cell_lineage %in% label)) %>%
# mutate(lineage_match = str_detect(label, Cell_lineage)) %>%
# group_by(Cell_lineage) %>%
# summarize(num_matches = sum(lineage_match)) %>%
I
mirna_target_df <- read_tsv("../data/miRDB_v5.0_prediction_result.txt", col_names = FALSE) %>%
set_names(c("mirna", "gene", "score")) %>%
filter(str_detect(mirna, "hsa"))
mirna_mrna_target_df <- read_tsv("../data/synergizer.tsv", skip = 4) %>%
mutate(refseq_mrna = str_split(refseq_mrna, " ")) %>%
unnest(refseq_mrna) %>%
filter(refseq_mrna %in% mirna_target_df$gene) %>%
left_join(mirna_target_df, by = c("refseq_mrna" = "gene")) %>%
mutate(mirna_target = str_c(mirna, hgnc_symbol, sep = "_"))
bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df,
mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
mirna_mrna_corr_df) %>%
group_by(disease, correlate_type) %>%
summarise(num_correlations = length(estimate),
significant = sum(p.adjust < 0.05, na.rm = TRUE),
other = num_correlations - significant) %>%
gather(correlations, total, -disease, -correlate_type, -num_correlations) %>%
ggplot(aes(x = disease, y = total)) +
geom_col(aes(fill = disease, alpha = correlations), colour = "slategray") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(angle = 45),
legend.position = "top") +
scale_fill_manual(values = tcga_colors$Color) +
scale_alpha_manual(values = c(0.4, 1)) +
guides(fill = FALSE) +
facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")
correlate_annotations <- mirna_mrna_corr_df %>%
select(disease, mirna, correlate, estimate, statistic, p.value, method, alternative, p.adjust, correlate_type) %>%
bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df,
mirna_mut_load_corr_df, .) %>%
distinct(correlate, correlate_type) %>%
left_join(gene_correlate_df,
by = c("correlate" = "matched_name")) %>%
mutate(label = ifelse(correlate_type == "mRNA expression",
`HGNC Symbol`, correlate),
label = str_replace(correlate, "\\.\\.Tregs\\.", ""),
label = ifelse(label == "leuk_frac", "Leukocyte Fraction", label),
group = ifelse(correlate_type == "leukocyte fraction", "Total Immune Cells", group),
group = ifelse(correlate_type == "mutation load", "Mutation Load", group),
group = ifelse(correlate_type == "CIBERSORT fraction" & str_detect(label, "^(T|B)\\."),
"Adaptive Immune Cells", group),
group = ifelse(correlate_type == "CIBERSORT fraction" & !str_detect(label, "^(T|B)"),
"Innate Immune Cells", group)) %>%
select(correlate, correlate_type, group, label)
mirna_sig_corr_df <- bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df,
mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
mirna_mrna_corr_df) %>%
filter(!(disease %in% c("DLBC", "THYM", "LAML"))) %>%
filter(!is.na(p.adjust),
p.adjust < 0.05) %>%
mutate(disease = factor(disease, levels = tcga_colors$Disease)) %>%
left_join(correlate_annotations, by = c("correlate", "correlate_type")) %>%
filter(!is.na(group))
mirna_sig_corr_df %>%
mutate(direction = ifelse(estimate > 0, "positive", "negative")) %>%
group_by(disease, correlate_type, direction) %>%
tally() %>%
ungroup() %>%
mutate(count = ifelse(direction == "negative", -1 * n, n)) %>%
ggplot(aes(x = disease, y = count)) +
geom_col(aes(fill = disease, alpha = direction), colour = "slategray") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top") +
scale_fill_manual(values = tcga_colors$Color) +
scale_alpha_manual(values = c(0.4, 1)) +
guides(fill = FALSE) +
facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")
mirna_support_df <- mirna_sig_corr_df %>%
select(disease, mirna, correlate, estimate, p.adjust, correlate_type,
group, label) %>%
mutate(mirna_target = str_c(mirna, correlate, sep = "_"),
mirna_disease = str_c(mirna, disease, sep = "_"),
mirna_short = str_extract(mirna,
"(?<=hsa\\-).*"),
mirna_short_ambiguous = str_extract(mirna_short,
"[:alnum:]+\\-[:alnum:]+")) %>%
mutate(significant = TRUE,
immunomodulator = correlate_type == "mRNA expression",
infiltrate = correlate_type %in% c("CIBERSORT fraction", "leukocyte fraction"),
leukocyte = correlate_type == "leukocyte fraction",
celltype = correlate_type == "CIBERSORT fraction",
mirbase_support = (estimate < 0) & (correlate_type == "mRNA expression") &
mirna_target %in% mirna_mrna_target_df$mirna_target,
sygnal_support = (correlate_type %in% c("CIBERSORT fraction", "leukocyte fraction")) &
mirna_disease %in% mirna_causal_df$mirna_disease,
immune_support = mirna_short %in% mirna_immune_tidy_df$mirna |
mirna_short_ambiguous %in% mirna_immune_tidy_df$mirna,
strong = abs(estimate) > 0.5,
immune_strong = immune_support & strong)
mirna_support_df %>%
gather(flag, status, significant, strong, immune_strong,
immunomodulator, infiltrate, leukocyte, celltype,
mirbase_support, sygnal_support, immune_support) %>%
group_by(mirna, flag) %>%
summarize(support = n_distinct(disease[status])) %>%
ungroup() %>%
filter(support > 0) %>%
group_by(flag) %>%
tally()
mirna_support_df %>%
gather(flag, status, significant, strong,
immunomodulator, infiltrate, leukocyte, celltype,
mirbase_support, sygnal_support, immune_support) %>%
group_by(disease, flag) %>%
summarize(support = sum(status)) %>%
ungroup() %>%
spread(flag, support) %>%
select(one_of(c("disease", "significant", "strong", "immune_support",
"immunomodulator", "mirbase_support",
"infiltrate", "leukocyte", "celltype", "sygnal_support")))
mirna_leuk_common_df <- mirna_support_df %>%
filter(sygnal_support & immune_support) %>%
group_by(mirna, disease) %>%
summarize(num_types = n_distinct(group)) %>%
ungroup() %>%
group_by(mirna) %>%
mutate(nz_disease = n_distinct(disease[num_types > 0])) %>%
ungroup() %>%
filter(nz_disease > 2)
mirna_leuk_common_df %>%
ggplot(aes(x = disease, y = mirna)) +
geom_tile(aes(fill = num_types)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot_colors <- tcga_colors %>%
filter(Disease %in% mirna_leuk_common_df$disease)
p <- mirna_support_df %>%
filter(sygnal_support & immune_support,
mirna %in% mirna_leuk_common_df$mirna) %>%
mutate(group = fct_inorder(group),
mirna = str_extract(mirna, "miR.*"),
correlation = ifelse(estimate > 0, "positive", "negative"),
disease = factor(disease, levels = plot_colors$Disease)) %>%
ggplot(aes(y = label, x = disease)) +
geom_tile(aes(fill = disease, colour = correlation), size = 0.4) +
scale_fill_manual("Tumor Group", values = plot_colors$Color) +
scale_colour_manual("Correlation", values = c(my_cet_pal[1], my_cet_pal[256])) +
guides(fill = guide_legend(nrow = 3, byrow = TRUE),
colour = guide_legend(override.aes = list(fill = "#CCCCCC"),
reverse = TRUE)) +
ylab("") +
xlab("") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "top",
strip.text.x = element_text(face = "bold", angle = 90, hjust = 0),
strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
strip.background = element_blank()) +
facet_grid(group ~ mirna, scales = "free", space = "free")
p
ggsave("../figures/mirna_leuk_supported_corr.png", p,
width = 17, height = 8, units = "cm", dpi = 300, scale = 2)
mirna_immunomod_common_df %>%
ggplot(aes(x = disease, y = mirna)) +
geom_tile(aes(fill = num_types)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot_colors <- tcga_colors %>%
filter(Disease %in% mirna_immunomod_common_df$disease)
p <- mirna_support_df %>%
filter(mirbase_support & immune_support,
mirna %in% mirna_immunomod_common_df$mirna) %>%
mutate(group = fct_inorder(group),
mirna = str_extract(mirna, "miR.*"),
correlation = ifelse(estimate > 0, "positive", "negative"),
disease = factor(disease, levels = plot_colors$Disease)) %>%
ggplot(aes(y = label, x = disease)) +
geom_tile(aes(fill = disease),
colour = my_cet_pal[1], size = 0.4) +
scale_fill_manual("Tumor Group", values = plot_colors$Color) +
guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
ylab("") +
xlab("") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "top",
strip.text.x = element_text(face = "bold", angle = 90, hjust = 0),
strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
strip.background = element_blank()) +
facet_grid(group ~ mirna, scales = "free", space = "free")
p
ggsave("../figures/mirna_immunomod_supported_corr.png", p,
width = 17, height = 5, units = "cm", dpi = 300, scale = 2)
mirna_all_common_df <- mirna_support_df %>%
group_by(mirna, disease) %>%
summarize(all_support = sum(mirbase_support) > 0 &
sum(sygnal_support) > 0 &
sum(immune_support) > 0) %>%
filter(all_support) %>%
group_by(mirna) %>%
mutate(nz_disease = n_distinct(disease)) %>%
filter(nz_disease > 1)
p <- mirna_support_df %>%
filter(mirna %in% mirna_all_common_df$mirna,
correlate_type != "mutation load") %>%
mutate(support = (mirbase_support | sygnal_support) & immune_support,
group = fct_inorder(group),
mirna_short = str_extract(mirna, "miR.*")) %>%
group_by(mirna, label) %>%
mutate(num_diseases = n_distinct(disease)) %>%
group_by(group) %>%
mutate(label = fct_reorder(label, num_diseases)) %>%
ungroup() %>%
ggplot(aes(y = label, x = disease)) +
geom_tile(aes(fill = estimate, colour = support, size = support)) +
scale_fill_gradientn("Correlation", colours = my_cet_pal) +
scale_colour_manual("Support", values = c("#333333", "#E69F00")) +
scale_size_manual("Support", values = c(0.1, 1)) +
ylab("") +
xlab("") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top",
strip.text.x = element_text(face = "bold"),
strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
strip.background = element_blank()) +
facet_grid(group ~ mirna_short, scales = "free", space = "free")
p
ggsave("../figures/mir142_supported_corr.png", p,
width = 16, height = 16, units = "cm", dpi = 300, scale = 2)